- C.S.M.P. Digest Mon, 21 Dec 92 Volume 1 : Issue 225
-
- Today's Topics:
-
- Help! making an assembly routine faster
-
-
-
- The Comp.Sys.Mac.Programmer Digest is moderated by Michael A. Kelly.
-
- The digest is a collection of article threads from the internet newsgroup
- comp.sys.mac.programmer. It is designed for people who read c.s.m.p. semi-
- regularly and want an archive of the discussions. If you don't know what a
- newsgroup is, you probably don't have access to it. Ask your systems
- administrator(s) for details. You can post articles to any newsgroup by
- mailing your article to newsgroup@ucbvax.berkeley.edu. So, to post an
- article to comp.sys.mac.programmer, you mail it to
- comp-sys-mac-programmer@ucbvax.berkeley.edu. Note the '-' instead of '.'
- in the newsgroup name.
-
- Each issue of the digest contains one or more sets of articles (called
- threads), with each set corresponding to a 'discussion' of a particular
- subject. The articles are not edited; all articles included in this digest
- are in their original posted form (as received by our news server at
- cs.uoregon.edu). Article threads are not added to the digest until the last
- article added to the thread is at least one month old (this is to ensure that
- the thread is dead before adding it to the digest). Article threads that
- consist of only one message are generally not included in the digest.
-
- The entire digest is available for anonymous ftp from ftp.cs.uoregon.edu
- [128.223.8.8] in the directory /pub/mac/csmp-digest. Be sure to read the
- file /pub/mac/csmp-digest/README before downloading any files. The most
- recent issues are available from sumex-aim.stanford.edu [36.44.0.6] in the
- directory /info-mac/digest/csmp. If you don't have ftp capability, the sumex
- archive has a mail server; send a message with the text '$MACarch help' (no
- quotes) to LISTSERV@ricevm1.rice.edu for more information.
-
- The digest is also available via email. Just send a note saying that you
- want to be on the digest mailing list to mkelly@cs.uoregon.edu, and you will
- automatically receive each new issue as it is created. Sorry, back issues
- are not available through the mailing list.
-
- Send administrative mail to mkelly@cs.uoregon.edu.
-
-
- -------------------------------------------------------
-
- From: mkelly@mystix.cs.uoregon.edu (Michael A. Kelly)
- Subject: Help! making an assembly routine faster
- Organization: High Risk Ventures
- Date: Sat, 14 Nov 1992 09:19:05 GMT
-
-
- Hey, all you assembly hackers! How can I make this routine faster? As it
- is it's only about 40% faster than CopyMask. (I'm using Think C 5.)
-
-
- /*
- * Quick8CopyMask
- *
- * The QuickXCopyMask family are much faster versions of CopyMask
- * that don't do clipping, dithering, etc. The source and destination
- * PixMaps are expected to have the same bit depth. The X in the name
- * represents the expected bit depth of the source and destination PixMaps.
- *
- * The mask is expected to be exactly the same size as the rectangle
- * that is being copied.
- *
- */
-
- void Quick8CopyMask(
- PixMapHandle srcMap,
- PixMapHandle dstMap,
- Ptr mask,
- Point srcPt,
- Point dstPt,
- short width,
- short height )
- {
-
- register char *src;
- register char *dst;
- register long srcNewline;
- register long dstNewline;
- char mode32 = QD32COMPATIBLE;
- short w = (width >> 2) - 1;
- short e = width % 4 - 1;
- short h = height - 1;
-
- // Set up pointers to the beginning of the memory to copy
- // and calculate the newline value for the source and destination
-
- src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3FFF) * srcPt.v + srcPt.h;
- srcNewline = ((*srcMap)->rowBytes & 0x3FFF) - width;
-
- dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3FFF) * dstPt.v + dstPt.h;
- dstNewline = ((*dstMap)->rowBytes & 0x3FFF) - width;
-
- // Switch into 32 bit addressing mode
-
- SwapMMUMode( &mode32 );
-
- // Copy the rect from the source to the destination
-
- asm {
-
- MOVE.W h, D0 ; put height loop variable in D0
- MOVEA.L src, A0 ; put the source pixmap address in A0
- MOVEA.L dst, A1 ; put the destination address in A1
- MOVEA.L mask, A2 ; put the mask address in A2
- MOVE.L #0, D3
-
- @1: ; copy the next row
- MOVE.W w, D1
-
- @2: ; copy the next four bytes in the row
-
- MOVEQ #0, D2 ; test the next four bits in the mask
- BFTST (A2){D3:1} ; test the bit
- BEQ @bit2 ; if zero, go to bit 2
- ORI.L #0xFF000000, D2 ; else add to pixel mask
- @bit2:
- ADDQ.L #1, D3 ; increment the bit number
- BFTST (A2){D3:1} ; test the bit
- BEQ @bit3 ; if zero, go to bit 3
- ORI.L #0x00FF0000, D2 ; else add to pixel mask
- @bit3:
- ADDQ.L #1, D3 ; increment the bit number
- BFTST (A2){D3:1} ; test the bit
- BEQ @bit4 ; if zero, go to bit 4
- ORI.L #0x0000FF00, D2 ; else add to pixel mask
- @bit4:
- ADDQ.L #1, D3 ; increment the bit number
- BFTST (A2){D3:1} ; test the bit
- BEQ @inc ; if zero, continue
- ORI.L #0x000000FF, D2 ; else add to pixel mask
- @inc:
- ADDQ.L #1, D3 ; increment the bit number
-
-
- ; speeding this next part up would make a big difference, but how?
-
- MOVE.L D2, D4 ; save the mask
- NOT.L D4 ; invert the mask
- AND.L (A0)+, D2 ; compute the pixels to be copied
- AND.L (A1), D4 ; compute the pixels to be saved
- OR.L D2, D4 ; combine the copied and saved pixels
- MOVE.L D4, (A1)+ ; copy the pixels
-
- DBF D1, @2
-
- TST.W e
- BLT @4 ; continue if e is less than 0
-
- MOVE.W e, D1 ; copy the extra bytes, if any
-
- @3: ; copy the next byte
-
- BFTST (A2){D3:1} ; test the next bit in the mask
- BEQ @incb ; if zero, continue
- MOVE.B (A0)+, (A1)+ ; else copy the pixel
- @incb:
- ADDQ.L #1, D3 ; increment the bit number
-
- DBF D1, @3
-
- @4:
- ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
- ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
-
- DBF D0, @1
-
- }
-
- // Switch back to the previous addressing mode
-
- SwapMMUMode( &mode32 );
-
- }
-
-
-
-
- I'm new to assembly, as you can probably tell. I'm open to all suggestions.
-
- Thanks,
-
- Mike.
- - --
- _____________________________________________________________________________
- Michael A. Kelly Senior Partner
- mkelly@cs.uoregon.edu High Risk Ventures
- _____________________________________________________________________________
-
- +++++++++++++++++++++++++++
-
- From: jmunkki@vipunen.hut.fi (Juri Munkki)
- Date: 14 Nov 92 20:08:31 GMT
- Organization: Helsinki University of Technology
-
- In article <1992Nov14.091905.29520@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
- >Hey, all you assembly hackers! How can I make this routine faster? As it
- >is it's only about 40% faster than CopyMask. (I'm using Think C 5.)
-
- This sounds like something I might be able to help with... let's see...
-
- > src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3FFF) * srcPt.v + srcPt.h;
-
- Shouldn't you cast to long before the multiply? It looks to me like you are
- casting the result of a short multiply, but I could be wrong, since I don't
- want to check this from a C book right now.
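-
- A minimal illustration of the point being raised (hypothetical values; assuming
- 16-bit ints, THINK C's default):
-
-     short rowBytes = 2048, v = 40;
-     long  bad  = (long) (rowBytes * v);  /* 16-bit multiply overflows, then widens */
-     long  good = (long) rowBytes * v;    /* widened first, multiplied as a long    */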
-
- > MOVE.W h, D0 ; put height loop variable in D0
- > MOVEA.L src, A0 ; put the source pixmap address in A0
- > MOVEA.L dst, A1 ; put the destination address in A1
- > MOVEA.L mask, A2 ; put the mask address in A2
- > MOVE.L #0, D3
- >
- > @1: ; copy the next row
- > MOVE.W w, D1
- >
- > @2: ; copy the next four bytes in the row
- >
- > MOVEQ #0, D2 ; test the next four bits in the mask
- > BFTST (A2){D3:1} ; test the bit
- > BEQ @bit2 ; if zero, go to bit 2
- > ORI.L #0xFF000000, D2 ; else add to pixel mask
- > @bit2:
- > ADDQ.L #1, D3 ; increment the bit number
- > BFTST (A2){D3:1} ; test the bit
- > BEQ @bit3 ; if zero, go to bit 3
- > ORI.L #0x00FF0000, D2 ; else add to pixel mask
- > @bit3:
- > ADDQ.L #1, D3 ; increment the bit number
- > BFTST (A2){D3:1} ; test the bit
- > BEQ @bit4 ; if zero, go to bit 4
- > ORI.L #0x0000FF00, D2 ; else add to pixel mask
- > @bit4:
- > ADDQ.L #1, D3 ; increment the bit number
- > BFTST (A2){D3:1} ; test the bit
- > BEQ @inc ; if zero, continue
- > ORI.L #0x000000FF, D2 ; else add to pixel mask
- > @inc:
- > ADDQ.L #1, D3 ; increment the bit number
-
- Instead of the above code, extract as many bits as you want (I suggest
- 8 bits, but 4 is also ok) and then use this number as an index to a
- table of precalculated masks.
-
- 8 bits is fast, because you don't need to use bitfield instructions to
- retrieve the value. You then grab two masks from bitfield tables, so
- you avoid all that ORI.L stuff and the increments to the bit numbers.
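-
- A rough sketch of such a table (one possible construction, not code from the
- post), built once before the copy loop:
-
-     static unsigned long pixelMask[16];   /* one 32-bit pixel mask per 4-bit nibble */
-     short i;
-
-     for (i = 0; i < 16; i++) {
-         unsigned long m = 0;
-         if (i & 0x8)  m |= 0xFF000000;    /* nibble bit 3 = leftmost pixel */
-         if (i & 0x4)  m |= 0x00FF0000;
-         if (i & 0x2)  m |= 0x0000FF00;
-         if (i & 0x1)  m |= 0x000000FF;
-         pixelMask[i] = m;
-     }
-
- The inner loop can then replace the four BFTST/ORI pairs with a single lookup
- per nibble and keep the same AND/NOT/AND/OR blend.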
-
- >; speeding this next part up would make a big difference, but how?
-
- You could take care of longword alignment on all reads and writes to
- the destination buffer. This requires quite a bit of extra code, but it
- might be worth it, since memory accesses can become twice as fast for
- the video memory, which is usually very slow. Of course now that the
- processors have data caches (unlike the 68020), it's probably not all
- that critical.
-
- Another possibility is to grab just a few mask bits (like 4, as I
- suggested) at a time and write special code for all the 16 possible
- cases. Use a jump table to select the code to use. In the usual case,
- where the mask is all black, you get a simple Move.l (An)+,(An)+, which
- really should do wonders to this routine. You also have 3 cases where
- you do a Move.w (An)+,(An)+ with some adjustments to the registers, 4
- cases of move.b, one case where you don't do anything, so that only
- leaves you with 7 more complicated cases, where you might want to use a
- constant mask.
-
- I think you can get fairly good performance if you carefully code the
- two cases where you have an empty mask or a full mask. The rest occur
- less often.
-
- You routine wastes most of its time in the mask handling code. You could
- have tested for this by doing timing tests where you replace some part
- of the code with fast dummy code and compare the relative speeds. Those
- parts that execute much faster as the dummy version need more attention
- than those where the difference is small.
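-
- One way to set up such a comparison (a hypothetical harness; the pixmaps, mask
- and points are assumed to be set up elsewhere, and since TickCount() only
- resolves 1/60 of a second the call is repeated many times):
-
-     #define REPS 500L
-
-     unsigned long start, elapsed;
-     long i;
-
-     start = TickCount();
-     for (i = 0; i < REPS; i++)
-         Quick8CopyMask(srcMap, dstMap, maskPtr, srcPt, dstPt, 32, 32);
-     elapsed = TickCount() - start;        /* compare against a dummy-bodied build */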
-
- - --
- Juri Munkki Windsurf: fast sailing
- jmunkki@hut.fi Macintosh: fast software
-
- +++++++++++++++++++++++++++
-
- From: mkelly@mystix.cs.uoregon.edu (Michael A. Kelly)
- Organization: University of Oregon Computer and Information Sciences Dept.
- Date: Mon, 16 Nov 1992 01:48:50 GMT
-
- In article <1992Nov14.200831.20477@nntp.hut.fi> jmunkki@vipunen.hut.fi (Juri Munkki) writes:
- >In article <1992Nov14.091905.29520@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
- >>Hey, all you assembly hackers! How can I make this routine faster? As it
- >>is it's only about 40% faster than CopyMask. (I'm using Think C 5.)
- >
- >This sounds like something I might be able to help with... let's see...
- >
- >> src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3FFF) * srcPt.v + srcPt.h;
- >
- >Shouldn't you cast to long before the multiply? It looks to me like you are
- >casting the result of a short multiply, but I could be wrong, since I don't
- >want to check this from a C book right now.
-
- Yep, but if you look closely at the parens, I think you'll find that that's
- what I'm doing.
-
- >Another possibility is to grab just a few mask bits (like 4, as I
- >suggested) at a time and write special code for all the 16 possible
- >cases. Use a jump table to select the code to use.
-
- OK, I did that, and managed to almost triple the speed of my original routine,
- making the new routine about four times as fast as CopyMask. And yet, I'd
- like to make it even faster. So suggestions are welcome.
-
- Someone else suggested that I just make the mask the same depth as the pixmaps,
- so that I could use the mask directly instead of having to extract bits from
- it. This turned out to be slower than the jump table approach, only about
- three times as fast as CopyMask. Of course, the problem could be with my
- assembly skills rather than with the theory.
-
- So, here are the resulting routines. The first uses the jump table approach,
- the second uses the wide mask approach. Can they be made even faster??
-
-
- /*
- * Quick8CopyMask
- *
- * The QuickXCopyMask family are much faster versions of CopyMask
- * that don't do clipping, dithering, etc. The source and destination
- * PixMaps are expected to have the same bit depth. The X in the name
- * represents the expected bit depth of the source and destination PixMaps.
- *
- * The mask is expected to be exactly the same size as the rectangle
- * that is being copied.
- *
- */
-
- void Quick8CopyMask(
- PixMapHandle srcMap,
- PixMapHandle dstMap,
- Ptr mask,
- Point srcPt,
- Point dstPt,
- short width,
- short height )
- {
-
- register char *src;
- register char *dst;
- register long srcNewline;
- register long dstNewline;
- char mode32 = QD32COMPATIBLE;
- short w = (width >> 3) - 1;
- short e = (width & 0x07) - 1;
- short h = height - 1;
-
- // Set up pointers to the beginning of the memory to copy
- // and calculate the newline value for the source and destination
-
- src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
- srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
-
- dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
- dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
-
- // Switch into 32 bit addressing mode
-
- SwapMMUMode( &mode32 );
-
- // Copy the rect from the source to the destination
-
- asm {
-
- MOVE.W h, D0 ; put height loop variable in D0
- MOVEA.L src, A0 ; put the source pixmap address in A0
- MOVEA.L dst, A1 ; put the destination address in A1
- MOVEA.L mask, A2 ; put the mask address in A2
-
- @1: ; copy the next row
- MOVE.W w, D1
-
- @2: ; copy the next eight bytes in the row
-
- MOVE.B (A2), D2 ; copy the next mask byte
-
- TST.B D2
- BEQ @nocopy ; if zero, don't copy anything
-
- CMPI.B #0xFF, D2
- BNE @hardway ; don't copy everything
-
- MOVE.L (A0)+, (A1)+ ; copy all bytes
- MOVE.L (A0)+, (A1)+
- ADDQ.L #1, A2
- JMP @endloop
-
- @nocopy: ; copy no bytes
- ADDQ.L #8, A0
- ADDQ.L #8, A1
- ADDQ.L #1, A2
- JMP @endloop
-
- @hardway:
- ANDI.L #0xF0, D2 ; mask off the low four bits
- LSR.W #4, D2 ; shift bits 4-7 into bits 0-3
- ADD.W D2, D2 ; double the index
- ADD.W @table(D2.W), D2 ; calculate the address
- JSR @table(D2.W) ; plot four pixels
-
- CLR.L D2 ; clear the mask register
- MOVE.B (A2)+, D2 ; copy the next mask byte
- ANDI.B #0xF, D2 ; mask off the high four bits
- ADD.W D2, D2 ; double the index
- ADD.W @table(D2.W), D2 ; calculate the address
- JSR @table(D2.W) ; plot four pixels
-
- @endloop:
- DBF D1, @2
-
- TST.W e
- BLT @4 ; continue if e is less than 0
-
- MOVE.W e, D1 ; copy the extra bytes, if any
-
- @3: ; copy the next byte
-
- MOVEQ.L #0, D3 ; initialize the bit counter
- BTST D3, (A2) ; test the next bit in the mask
- BEQ @skip ; if zero, continue
- MOVE.B (A0)+, (A1)+ ; else copy the pixel
- JMP @incb
- @skip:
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- @incb:
- ADDQ.L #1, D3 ; increment the bit number
-
- DBF D1, @3
-
- @4:
- ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
- ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
-
- DBF D0, @1
-
- JMP @end ; skip to the end
-
- @table:
- DC.W @sub0
- DC.W @sub1
- DC.W @sub2
- DC.W @sub3
- DC.W @sub4
- DC.W @sub5
- DC.W @sub6
- DC.W @sub7
- DC.W @sub8
- DC.W @sub9
- DC.W @sub10
- DC.W @sub11
- DC.W @sub12
- DC.W @sub13
- DC.W @sub14
- DC.W @sub15
-
- @sub0: ; mask = 0000, draw nothing
- ADDQ.L #4, A0
- ADDQ.L #4, A1
- RTS
-
- @sub1: ; mask = 0001
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.B (A0)+, (A1)+
- RTS
-
- @sub2: ; mask = 0010
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.B (A0)+, (A1)+
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- RTS
-
- @sub3: ; mask = 0011
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.W (A0)+, (A1)+
- RTS
-
- @sub4: ; mask = 0100
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0)+, (A1)+
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- RTS
-
- @sub5: ; mask = 0101
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0)+, (A1)+
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0)+, (A1)+
- RTS
-
- @sub6: ; mask = 0110
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0)+, (A1)+
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- RTS
-
- @sub7: ; mask = 0111
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0)+, (A1)+
- MOVE.W (A0)+, (A1)+
- RTS
-
- @sub8: ; mask = 1000
- MOVE.B (A0)+, (A1)+
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- RTS
-
- @sub9: ; mask = 1001
- MOVE.B (A0)+, (A1)+
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.B (A0)+, (A1)+
- RTS
-
- @sub10: ; mask = 1010
- MOVE.B (A0)+, (A1)+
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0)+, (A1)+
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- RTS
-
- @sub11: ; mask = 1011
- MOVE.B (A0)+, (A1)+
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.W (A0)+, (A1)+
- RTS
-
- @sub12: ; mask = 1100
- MOVE.W (A0)+, (A1)+
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- RTS
-
- @sub13: ; mask = 1101
- MOVE.W (A0)+, (A1)+
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0)+, (A1)+
- RTS
-
- @sub14: ; mask = 1110
- MOVE.W (A0)+, (A1)+
- MOVE.B (A0)+, (A1)+
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- RTS
-
- @sub15: ; mask = 1111
- MOVE.L (A0)+, (A1)+
- RTS
-
- @end:
-
- }
-
- // Switch back to the previous addressing mode
-
- SwapMMUMode( &mode32 );
-
- }
-
-
-
- And the wide mask approach:
-
-
- void Quick8CopyMask(
- PixMapHandle srcMap,
- PixMapHandle dstMap,
- Ptr mask,
- Point srcPt,
- Point dstPt,
- short width,
- short height )
- {
-
- register char *src;
- register char *dst;
- register long srcNewline;
- register long dstNewline;
- char mode32 = QD32COMPATIBLE;
- short w = (width >> 2) - 1;
- short e = (width & 0x3) - 1;
- short h = height - 1;
-
- // Set up pointers to the beginning of the memory to copy
- // and calculate the newline value for the source and destination
-
- src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
- srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
-
- dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
- dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
-
- // Switch into 32 bit addressing mode
-
- SwapMMUMode( &mode32 );
-
- // Copy the rect from the source to the destination
-
- asm {
-
- MOVE.W h, D0 ; put height loop variable in D0
- MOVEA.L src, A0 ; put the source pixmap address in A0
- MOVEA.L dst, A1 ; put the destination address in A1
- MOVEA.L mask, A2 ; put the mask address in A2
-
- @1: ; copy the next row
- MOVE.W w, D1
-
- @2: ; copy the next four bytes in the row
-
- MOVE.L (A2)+, D2 ; copy the mask to D2
- MOVE.L D2, D4 ; save the mask
- NOT.L D4 ; invert the mask
- AND.L (A0)+, D2 ; compute the pixels to be copied
- AND.L (A1), D4 ; compute the pixels to be saved
- OR.L D2, D4 ; combine the copied and saved pixels
- MOVE.L D4, (A1)+ ; copy the pixels
-
- DBF D1, @2
-
- TST.W e
- BLT @4 ; continue if e is less than 0
-
- MOVE.W e, D1 ; copy the extra bytes, if any
-
- @3: ; copy the next byte
-
- MOVE.B (A2)+, D2 ; copy the mask to D2
- MOVE.B D2, D4 ; save the mask
- NOT.B D4 ; invert the mask
- AND.B (A0)+, D2 ; compute the pixels to be copied
- AND.B (A1), D4 ; compute the pixels to be saved
- OR.B D2, D4 ; combine the copied and saved pixels
- MOVE.B D4, (A1)+ ; copy the pixels
-
- DBF D1, @3
-
- @4:
- ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
- ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
-
- DBF D0, @1
-
- }
-
- // Switch back to the previous addressing mode
-
- SwapMMUMode( &mode32 );
-
- }
-
-
-
- - --
- _____________________________________________________________________________
- Michael A. Kelly Senior Partner
- mkelly@cs.uoregon.edu High Risk Ventures
- _____________________________________________________________________________
-
- +++++++++++++++++++++++++++
-
- From: jmunkki@vipunen.hut.fi (Juri Munkki)
- Date: 16 Nov 92 19:09:47 GMT
- Organization: Helsinki University of Technology
-
- In article <1992Nov16.014850.28678@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
- >So, here are the resulting routines. The first uses the jump table approach,
- >the second uses the wide mask approach. Can they be made even faster??
-
- Yes.
-
- > @2: ; copy the next eight bytes in the row
- >
- > MOVE.B (A2), D2 ; copy the next mask byte
- >
- > TST.B D2
-
- A move instruction always does an implied tst, so you can just throw away
- the test instruction.
-
- > BEQ @nocopy ; if zero, don't copy anything
- >
- > CMPI.B #0xFF, D2
- > BNE @hardway ; don't copy everything
-
- An addq.w #1, and then a beq might prove to be faster than the cmp with
- an immediate value. You have to adjust the mask back to its old value,
- if the test fails, but this can be done either with the jump tables
- (not with the ones you are using now, but the longer ones I will suggest
- later in this article) or by a subq.w #1
-
- >
- > MOVE.L (A0)+, (A1)+ ; copy all bytes
- > MOVE.L (A0)+, (A1)+
- > ADDQ.L #1, A2
-
- Do a move.b (A2)+ instead of this instruction. I can't see any reason why
- you can't do the increment there.
-
- > JMP @endloop
-
- Copy the end of the loop here. So that you have the DBF instruction here
- instead of a JMP. Put the jump after the DBF. There's absolutely no reason
- to jump around when you can just use another DBF.
-
- > @nocopy: ; copy no bytes
- > ADDQ.L #8, A0
- > ADDQ.L #8, A1
- > ADDQ.L #1, A2
- > JMP @endloop
-
- Same here as above.
-
- > @hardway:
- > ANDI.L #0xF0, D2 ; mask off the low four bits
- > LSR.W #4, D2 ; shift bits 4-7 into bits 0-3
-
- The AND is totally wasted. The LSR will do the masking for you. This
- is assuming that you can keep the high bytes of D2 cleared. I think
- you should be able to do it. (I think it's already that way.)
-
- You can also eliminate the and and lsr, if you use two 256-entry jump
- tables that simply ignore the high or low 4 bits. The tables will take
- some memory (2 x 4 x 256 bytes), but they are easy to construct with
- copy and paste.
-
- > ADD.W D2, D2 ; double the index
- > ADD.W @table(D2.W), D2 ; calculate the address
- > JSR @table(D2.W) ; plot four pixels
-
- The 68020 has addressing modes that do the multiplication of the index.
- I haven't needed them myself, but I'm fairly certain that you can improve
- this part with the right addressing mode.
-
- Replace the jsr with a LEA An to the return address and a JMP to the
- subroutine. Then jump back with a JMP (An). This is quite a bit faster
- than a JSR/RTS combination, although it's not "good style".
-
- > CLR.L D2 ; clear the mask register
- > MOVE.B (A2)+, D2 ; copy the next mask byte
- > ANDI.B #0xF, D2 ; mask off the high four bits
-
- Use BFEXTU, if you must read the mask again. Remember that you can use
- - -1(A2), if you already incremented A2 or you might be able to account
- for this with the bitfield offset. You can also use constant bitfield
- offsets, if I remember correctly. I think you have some registers that
- you could use, so you could store fairly constant bitfield indices
- there.
-
- > @sub6: ; mask = 0110
- > ADDQ.L #1, A0
- > ADDQ.L #1, A1
- > MOVE.B (A0)+, (A1)+
-
- This should be a move.w
-
- > ADDQ.L #1, A0
- > ADDQ.L #1, A1
- > RTS
- >
- > @sub8: ; mask = 1000
- > MOVE.B (A0)+, (A1)+
- > ADDQ.L #3, A0
- > ADDQ.L #3, A1
- > RTS
-
- A move.b (a0),(a1) along with addq #4 is faster on a 68000, but I
- don't think it matters on new processors. I may be wrong, but you'll
- probably never see the difference.
-
- In the deep mask version, you could unroll the loop. It's kind of
- surprising that the 1 bit mask is actually faster, but it's mostly
- because of the superior algorithm that allows you to directly copy
- 8 bytes at a time in the most common case.
-
- I think you did really well with the assembly. My changes will probably
- not make a big difference. I think 5% is the best you can hope for, but
- it might be as much as 10%. The only way to go beyond this is to make
- the move.l commands aligned on long word destinations, as I mentioned
- in my previous article.
-
- I hope my articles offer proof for the other half of my .signature... :-)
- Can anyone do significantly better? I really love optimizing graphics
- routines.
-
- - --
- Juri Munkki Windsurf: fast sailing
- jmunkki@hut.fi Macintosh: fast software
-
- +++++++++++++++++++++++++++
-
- From: mkelly@mystix.cs.uoregon.edu (Michael A. Kelly)
- Organization: High Risk Ventures
- Date: Wed, 18 Nov 1992 01:08:15 GMT
-
- In article <1992Nov16.190947.9920@nntp.hut.fi> jmunkki@vipunen.hut.fi (Juri Munkki) writes:
- >In article <1992Nov16.014850.28678@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
- >> CMPI.B #0xFF, D2
- >> BNE @hardway ; don't copy everything
- >
- >An addq.w #1, and then a beq might prove to be faster than the cmp with
- >an immediate value. You have to adjust the mask back to its old value,
- >if the test fails, but this can be done either with the jump tables
- >(not with the ones you are using now, but the longer ones I will suggest
- >later in this article) or by a subq.w #1
-
- According to the Motorola manual, you're right. But in practice this slowed
- things down quite a bit. I can't figure out why. I replaced the CMPI with
- an ADDQ #1, then at @hardway I did a SUBQ #1. My test case is a 32x32 rect
- with a 32x32 filled circle as the mask. I think it would slow things down
- a lot more with more complicated masks. But still, I don't know why it's
- slower, since the CMPI takes 8 clock cycles and the ADDQ and SUBQ each
- take 4, so it really should be faster.... Then again, those timings are
- for the 68000 (and 68020 too I think), and I'm using a 68040.
-
- >> @hardway:
- >> ANDI.L #0xF0, D2 ; mask off the low four bits
- >> LSR.W #4, D2 ; shift bits 4-7 into bits 0-3
- >
- >The AND is totally wasted. The LSR will do the masking for you.
-
- :) I don't know *what* I was thinking....
-
- At this point, I ran my test again with the above modifications. They
- improved the speed by about 10%. (Changing the CMPI above decreased
- performance by about 30% with these other changes also in place.)
-
- >You can also eliminate the and and lsr, if you use two 256-entry jump
- >tables that simply ignore the high or low 4 bits. The tables will take
- >some memory (2 x 4 x 256 bytes), but they are easy to construct with
- >copy and paste.
-
- You mean two 16-entry jump tables, right? I didn't implement this, but
- instead made a separate CopyMask function that used a single 256-entry
- jump table, with 256 subroutines for each of the 256 possible mask-bytes.
- See the code fragment below.
-
- Hey, maybe I could just save 256 (times two) masks, AND each mask with the
- source and destination bytes, then OR those two results together to get
- the resulting pixel. Hmmm, I wonder if it would be even faster....
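-
- A sketch of that idea (one possible way to build the table, not code from the
- thread): expand every possible mask byte into two longs, once, up front:
-
-     static unsigned long wideMask[256][2];    /* 2K total; 8 pixels' worth per entry */
-     short i, b;
-
-     for (i = 0; i < 256; i++) {
-         wideMask[i][0] = wideMask[i][1] = 0;
-         for (b = 0; b < 8; b++)
-             if (i & (0x80 >> b))              /* mask bit 7 = leftmost pixel */
-                 wideMask[i][b >> 2] |= 0xFFUL << (8 * (3 - (b & 3)));
-     }
-
- Each pair could then drive the same AND/NOT/AND/OR blend the wide-mask version
- uses, without having to widen the mask bitmap itself.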
-
- >> ADD.W D2, D2 ; double the index
- >> ADD.W @table(D2.W), D2 ; calculate the address
- >> JSR @table(D2.W) ; plot four pixels
- >
- >The 68020 has addressing modes that do the multiplication of the index.
- >I haven't needed them myself, but I'm fairly certain that you can improve
- >this part with the right addressing mode.
-
- Nope, I don't think so. You're talking about Address Register Indirect with
- Offset and Index, like so: @table( <no address in this case>, D2.W*2 ).
- The problem is that the value of D2 is preserved in that operation, so instead
- of D2 = (D2 * 2) + @table + (D2 * 2), you get D2 = D2 + @table + (D2 * 2).
-
- >Replace the jsr with a LEA An to the return address and a JMP to the
- >subroutine. Then jump back with a JMP (An). This is quite a bit faster
- >than a JSR/RTS combination, although it's not "good style".
-
- Wow, that made a big difference! About a 17% improvement, making the total
- speedup about 25%.
-
- >> CLR.L D2 ; clear the mask register
- >> MOVE.B (A2)+, D2 ; copy the next mask byte
- >> ANDI.B #0xF, D2 ; mask off the high four bits
- >
- >Use BFEXTU, if you must read the mask again. Remember that you can use
- >-1(A2), if you already incremented A2 or you might be able to account
- >for this with the bitfield offset. You can also use constant bitfield
- >offsets, if I remember correctly. I think you have some registers that
- >you could use, so you could store fairly constant bitfield indices
- >there.
-
- I'm not sure what you mean by constant offsets. I did this:
- BFEXTU -1(A2){4:4}
- and it slowed it down by about 4%.
-
- >> @sub8: ; mask = 1000
- >> MOVE.B (A0)+, (A1)+
- >> ADDQ.L #3, A0
- >> ADDQ.L #3, A1
- >> RTS
- >
- >A move.b (a0),(a1) along with addq #4 is faster on a 68000, but I
- >don't think it matters on new processors. I may be wrong, but you'll
- >probably never see the difference.
-
- You're right, it didn't make any difference at all on my '040.
-
- >In the deep mask version, you could unroll the loop. It's kind of
- >surprising the the 1 bit mask is actually faster, but it's mostly
- >because of the superior algorithm that allows you to directly copy
- >8 bytes at a time in the most common case.
-
- I tossed that code. I don't really think unrolling the loop will get it
- down to my current speed, which is more than twice as fast.
-
- >it might be as much as 10%. The only way to go beyond this is to make
- >the move.l commands aligned on long word destinations, as I mentioned
- >in my previous article.
-
- But as long as I align the source and destination Pixmaps, that isn't an
- issue, right?
-
- >I hope my articles offer proof for the other half of my .signature... :-)
-
- Definitely :)
-
-
- OK, here's the new code. The first one is the newer, better version of
- Quick8CopyMask, with most of the optimizations suggested by Juri. It's
- about 5.5 times as fast as QuickDraw's CopyMask, at least with my simple
- circle mask test case. The second one is a small part of a very large
- Quick8CopyMask that has 256 separate subroutines to handle each mask
- byte, rather than only 16 subroutines to handle a mask nibble (a nibble is
- half a byte, right?). It's far too long to post here, but if you want a
- copy I'll be happy to email it to you. It's about 6.5 times as fast as
- CopyMask; about 15% faster than the short version.
-
- I tested the routines with the mask used in the CalcCMask DTS snippet;
- the short version was 5.7 times as fast as CopyMask and the long version
- was 7 times as fast.
-
- And once again, if anyone can improve on these routines, please tell me how!
-
-
- void Quick8CopyMask(
- PixMapHandle srcMap,
- PixMapHandle dstMap,
- Ptr mask,
- Point srcPt,
- Point dstPt,
- short width,
- short height )
- {
-
- register char *src;
- register char *dst;
- register long srcNewline;
- register long dstNewline;
- char mode32 = QD32COMPATIBLE;
- short w = (width >> 3) - 1;
- short e = (width & 0x07) - 1;
- short h = height - 1;
-
- // Set up pointers to the beginning of the memory to copy
- // and calculate the newline value for the source and destination
-
- src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
- srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
-
- dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
- dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
-
- // Switch into 32 bit addressing mode
-
- SwapMMUMode( &mode32 );
-
- // Copy the rect from the source to the destination
-
- asm {
-
- MOVE.W h, D0 ; put height loop variable in D0
- MOVEA.L src, A0 ; put the source pixmap address in A0
- MOVEA.L dst, A1 ; put the destination address in A1
- MOVEA.L mask, A2 ; put the mask address in A2
- CLR.L D2 ; clear the mask register
-
- @1: ; copy the next row
- MOVE.W w, D1
-
- @2: ; copy the next eight bytes in the row
-
- MOVE.B (A2)+, D2 ; copy the next mask byte
- BEQ @nocopy ; if zero, don't copy anything
-
- CMPI.B #0xFF, D2
- BNE @hardway ; don't copy everything
-
- MOVE.L (A0)+, (A1)+ ; copy all bytes
- MOVE.L (A0)+, (A1)+
-
- DBF D1, @2
- JMP @endloop
-
- @nocopy: ; copy no bytes
- ADDQ.L #8, A0
- ADDQ.L #8, A1
-
- DBF D1, @2
- JMP @endloop
-
- @hardway:
- LSR.W #4, D2 ; shift bits 4-7 into bits 0-3
- ADD.W D2, D2 ; double the index
- ADD.W @table(D2.W), D2 ; calculate the address
- LEA @rts1, A3 ; save the return address
- JMP @table(D2.W) ; plot four pixels
- @rts1:
-
- MOVE.B -1(A2), D2 ; copy the next mask byte
- ANDI.B #0xF, D2 ; mask off the high four bits
- ADD.W D2, D2 ; double the index
- ADD.W @table(D2.W), D2 ; calculate the address
- LEA @rts2, A3 ; save the return address
- JMP @table(D2.W) ; plot four pixels
- @rts2:
-
- DBF D1, @2
-
- @endloop:
-
- TST.W e
- BLT @4 ; continue if e is less than 0
-
- MOVE.B (A2)+, D2 ; copy the next mask byte
- MOVE.W e, D1 ; initialize the loop counter
- MOVEQ.L #7, D3 ; initialize the bit counter
-
- @3: ; copy the next byte
- BTST D3, D2 ; test the next bit in the mask
- BEQ @skip ; if zero, continue
- MOVE.B (A0)+, (A1)+ ; else copy the pixel
- SUBQ.L #1, D3 ; decrement the bit counter
- DBF D1, @3
- JMP @4
- @skip:
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- SUBQ.L #1, D3 ; decrement the bit counter
- DBF D1, @3
-
- @4:
- ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
- ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
-
- DBF D0, @1
-
- JMP @end ; skip to the end
-
- @table:
- DC.W @sub0
- DC.W @sub1
- DC.W @sub2
- DC.W @sub3
- DC.W @sub4
- DC.W @sub5
- DC.W @sub6
- DC.W @sub7
- DC.W @sub8
- DC.W @sub9
- DC.W @sub10
- DC.W @sub11
- DC.W @sub12
- DC.W @sub13
- DC.W @sub14
- DC.W @sub15
-
- @sub0: ; mask = 0000, draw nothing
- ADDQ.L #4, A0
- ADDQ.L #4, A1
- JMP (A3) ; RTS
-
- @sub1: ; mask = 0001
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.B (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub2: ; mask = 0010
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- JMP (A3) ; RTS
-
- @sub3: ; mask = 0011
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.W (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub4: ; mask = 0100
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- JMP (A3) ; RTS
-
- @sub5: ; mask = 0101
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.B (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub6: ; mask = 0110
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.W (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- JMP (A3) ; RTS
-
- @sub7: ; mask = 0111
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0)+, (A1)+
- MOVE.W (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub8: ; mask = 1000
- MOVE.B (A0), (A1)
- ADDQ.L #4, A0
- ADDQ.L #4, A1
- JMP (A3) ; RTS
-
- @sub9: ; mask = 1001
- MOVE.B (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.B (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub10: ; mask = 1010
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- JMP (A3) ; RTS
-
- @sub11: ; mask = 1011
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.W (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub12: ; mask = 1100
- MOVE.W (A0), (A1)
- ADDQ.L #4, A0
- ADDQ.L #4, A1
- JMP (A3) ; RTS
-
- @sub13: ; mask = 1101
- MOVE.W (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.B (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub14: ; mask = 1110
- MOVE.W (A0)+, (A1)+
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- JMP (A3) ; RTS
-
- @sub15: ; mask = 1111
- MOVE.L (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @end:
-
- }
-
- // Switch back to the previous addressing mode
-
- SwapMMUMode( &mode32 );
-
- }
-
-
-
-
- And this is the extremely long version, truncated for this posting:
-
-
- void Quick8CopyMask(
- PixMapHandle srcMap,
- PixMapHandle dstMap,
- Ptr mask,
- Point srcPt,
- Point dstPt,
- short width,
- short height )
- {
-
- register char *src;
- register char *dst;
- register long srcNewline;
- register long dstNewline;
- char mode32 = QD32COMPATIBLE;
- short w = (width >> 3) - 1;
- short e = (width & 0x07) - 1;
- short h = height - 1;
-
- // Set up pointers to the beginning of the memory to copy
- // and calculate the newline value for the source and destination
-
- src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
- srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
-
- dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
- dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
-
- // Switch into 32 bit addressing mode
-
- SwapMMUMode( &mode32 );
-
- // Copy the rect from the source to the destination
-
- asm {
-
- MOVE.W h, D0 ; put height loop variable in D0
- MOVEA.L src, A0 ; put the source pixmap address in A0
- MOVEA.L dst, A1 ; put the destination address in A1
- MOVEA.L mask, A2 ; put the mask address in A2
- CLR.L D2 ; clear the mask register
-
- @1: ; copy the next row
- MOVE.W w, D1
-
- @2: ; copy the next eight bytes in the row
-
- CLR.W D2 ; clear the mask register
- MOVE.B (A2)+, D2 ; copy the next mask byte
- BEQ @nocopy ; if zero, don't copy anything
-
- CMPI.B #0xFF, D2
- BNE @hardway ; don't copy everything
-
- MOVE.L (A0)+, (A1)+ ; copy all bytes
- MOVE.L (A0)+, (A1)+
-
- DBF D1, @2
- JMP @endloop
-
- @nocopy: ; copy no bytes
- ADDQ.L #8, A0
- ADDQ.L #8, A1
-
- DBF D1, @2
- JMP @endloop
-
- @hardway:
- ADD.W D2, D2 ; double the index
- ADD.W @table(D2.W), D2 ; calculate the address
- JMP @table(D2.W) ; plot eight pixels
-
- @endloop:
-
- TST.W e
- BLT @4 ; continue if e is less than 0
-
- MOVE.B (A2)+, D2 ; copy the next mask byte
- MOVE.W e, D1 ; initialize the loop counter
- MOVEQ.L #7, D3 ; initialize the bit counter
-
- @3: ; copy the next byte
- BTST D3, D2 ; test the next bit in the mask
- BEQ @skip ; if zero, continue
- MOVE.B (A0)+, (A1)+ ; else copy the pixel
- SUBQ.L #1, D3 ; decrement the bit counter
- DBF D1, @3
- JMP @4
- @skip:
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- SUBQ.L #1, D3 ; decrement the bit counter
- DBF D1, @3
-
- @4:
- ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
- ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
-
- DBF D0, @1
-
- JMP @end ; skip to the end
-
- @table:
- DC.W @sub0
- DC.W @sub1
- DC.W @sub2
- DC.W @sub3
- . .
- . .
- . .
- DC.W @sub253
- DC.W @sub254
- DC.W @sub255
-
- @sub0: ; mask = 00000000
- ADDQ.L #8, A0
- ADDQ.L #8, A1
- DBF D1, @2
- JMP @endloop
-
- @sub1: ; mask = 00000001
- ADDQ.L #7, A0
- ADDQ.L #7, A1
- MOVE.B (A0)+, (A1)+
- DBF D1, @2
- JMP @endloop
-
- @sub2: ; mask = 00000010
- ADDQ.L #6, A0
- ADDQ.L #6, A1
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- DBF D1, @2
- JMP @endloop
-
- . .
- . .
- . .
-
- @sub182: ; mask = 10110110
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.W (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.W (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- DBF D1, @2
- JMP @endloop
-
- @sub183: ; mask = 10110111
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.W (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.B (A0)+, (A1)+
- MOVE.W (A0)+, (A1)+
- DBF D1, @2
- JMP @endloop
-
- . .
- . .
- . .
-
- @sub253: ; mask = 11111101
- MOVE.L (A0)+, (A1)+
- MOVE.W (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.B (A0)+, (A1)+
- DBF D1, @2
- JMP @endloop
-
- @sub254: ; mask = 11111110
- MOVE.L (A0)+, (A1)+
- MOVE.W (A0)+, (A1)+
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- DBF D1, @2
- JMP @endloop
-
- @sub255: ; mask = 11111111
- MOVE.L (A0)+, (A1)+
- MOVE.L (A0)+, (A1)+
- DBF D1, @2
- JMP @endloop
-
- @end:
-
- }
-
- // Switch back to the previous addressing mode
-
- SwapMMUMode( &mode32 );
-
- }
-
-
- - --
- _____________________________________________________________________________
- Michael A. Kelly Senior Partner
- mkelly@cs.uoregon.edu High Risk Ventures
- _____________________________________________________________________________
-
- +++++++++++++++++++++++++++
-
- From: jmunkki@vipunen.hut.fi (Juri Munkki)
- Organization: Helsinki University of Technology
- Date: Wed, 18 Nov 1992 20:33:45 GMT
-
- In article <1992Nov18.010815.6649@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
- > According to the Motorola manual, you're right. But in practice this slowed
- > things down quite a bit. I can't figure out why. I replaced the CMPI with
- > an ADDQ #1, then at @hardway I did a SUBQ #1. My test case is a 32x32 rect
- > with a 32x32 filled circle as the mask. I think it would slow things down
- > a lot more with more complicated masks. But still, I don't know why it's
- > slower, since the CMPI takes 8 clock cycles and the ADDQ and SUBQ each
- > take 4, so it really should be faster.... Then again, those timings are
- > for the 68000 (and 68020 too I think), and I'm using a 68040.
-
- On the 040, the instructions can overlap quite a bit. I guess that the
- modification of a data register prevented the overlap. I suggest that
- you try storing the constant 0xFF in a free data register and doing
- the compare with the data register. Register to register compares should
- always be faster than immediate to register compares.
-
- > >it might be as much as 10%. The only way to go beyond this is to make
- > >the move.l commands aligned on long word destinations, as I mentioned
- > >in my previous article.
- >
- > But as long as I align the source and destination Pixmaps, that isn't an
- > issue, right?
-
- I thought about this alignment stuff and it occurred to me that the mask
- bitmap would be a lot harder to use if you aligned your writes to video
- RAM. On the Quadras, video RAM is so fast that alignment probably doesn't
- matter all that much. On NuBUS, things are usually quite different.
-
- > OK, here's the new code. The first one is the newer, better version of
- > Quick8CopyMask, with most of the optimizations suggested by Juri. It's
- > about 5.5 times as fast as QuickDraw's CopyMask, at least with my simple
- > circle mask test case. The second one is a small part of a very large
- > Quick8CopyMask that has 256 separate subroutines to handle each mask
- > byte, rather than only 16 subroutines to handle a mask nibble (a nibble is
- > half a byte, right?). It's far too long to post here, but if you want a
- > copy I'll be happy to email it to you. It's about 6.5 times as fast as
- > CopyMask; about 15% faster than the short version.
- >
- > I tested the routines with the mask used in the CalcCMask DTS snippet;
- > the short version was 5.7 times as fast as CopyMask and the long version
- > was 7 times as fast.
-
- It should be quite hard to improve speed from the longer code. I bet it took
- quite a few minutes to write it. :-)
-
- I do have an idea that you could try, if you still feel like the code should
- be improved.
-
- Snippet from long version:
- > @1: ; copy the next row
- > MOVE.W w, D1
- > @2: ; copy the next eight bytes in the row
- > CLR.W D2 ; clear the mask register
- > MOVE.B (A2)+, D2 ; copy the next mask byte
- > BEQ @nocopy ; if zero, don't copy anything
- >
- > CMPI.B #0xFF, D2
- > BNE @hardway ; don't copy everything
- >
- > MOVE.L (A0)+, (A1)+ ; copy all bytes
- > MOVE.L (A0)+, (A1)+
- >
- > DBF D1, @2
- > JMP @endloop
- >
- > @nocopy: ; copy no bytes
- > ADDQ.L #8, A0
- > ADDQ.L #8, A1
- >
- > DBF D1, @2
- > JMP @endloop
- >
- > @hardway:
- > ADD.W D2, D2 ; double the index
- > ADD.W @table(D2.W), D2 ; calculate the address
- > JMP @table(D2.W) ; plot eight pixels
-
- I finally dug up my 020 manual and went through the addressing modes.
-
- Instead of having a jump table, you should probably use a table of jumps. :-)
-
- clr.w D2
- @1
- move.w w,D1
-
- @2
- move.b (A2)+,D2
- jmp (@jumptable,PC,D2.w*4)
-
- @jumptable bra.w @mask0
- bra.w @mask1
- bra.w @mask2
- bra.w @mask3
- ...
- bra.w @mask254
- move.l (A0)+,(A1)+ ; This is mask 255
- move.l (A0)+,(A1)+
- dbf D1,@2
- ...
-
- I checked with Think C, and at least the above code (or something similar to it)
- compiles and the disassembly looks reasonable.
-
- Note that I removed the special checks for 0 and 255. I think they are
- mostly wasted, but it's possible they speed things up with masks that have
- large solid areas.
-
- - --
- Juri Munkki Windsurf: fast sailing
- jmunkki@hut.fi Macintosh: fast software
-
- +++++++++++++++++++++++++++
-
- From: mxmora@unix.SRI.COM (Matt Mora)
- Date: 19 Nov 92 17:43:28 GMT
- Organization: SRI International, Menlo Park, California
-
- In article <1992Nov18.010815.6649@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
- >
- >And once again, if anyone can improve on these routines, please tell me how!
-
-
- If you're going to be calling this function a lot (like in a tight loop
- in a game to plot sprites), you can move the SwapMMUMode code out of the
- function, call it once before you make the calls, and restore it afterward.
- That takes the two trap calls out of your fast code.
-
- like:
-
- swapmmumode(mode);
-
- while plottingsprites
- QuickCopy(sprites[i++]);
-
-
- swapmmumode(backtowhatiswas);
-
- If you can find a way to precompute the addresses (like a table of row
- starting addresses) that might help. They mention stuff like this in one
- of the develop articles.
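-
- A minimal sketch of that row-table idea (hypothetical names and a fixed row
- count; real code would size the table from the PixMap bounds):
-
-     #define MAX_ROWS 480
-
-     char *rowBase[MAX_ROWS];
-     char *base = GetPixBaseAddr(dstMap);
-     long  rb   = (*dstMap)->rowBytes & 0x3FFF;
-     short row;
-
-     for (row = 0; row < MAX_ROWS; row++)
-         rowBase[row] = base + rb * row;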
-
- Matt
-
-
-
- - --
- ___________________________________________________________
- Matthew Mora | my Mac Matt_Mora@sri.com
- SRI International | my unix mxmora@unix.sri.com
- ___________________________________________________________
-
- +++++++++++++++++++++++++++
-
- From: mkelly@mystix.cs.uoregon.edu (Michael A. Kelly)
- Organization: University of Oregon Computer and Information Sciences Dept.
- Date: Fri, 20 Nov 1992 02:13:40 GMT
-
-
- >On the 040, the instructions can overlap quite a bit. I guess that the
- >modification of a data register prevented the overlap. I suggest that
- >you try storing the constant 0xFF in a free data register and doing
- >the compare with the data register. Register to register compares should
- >always be faster than immediate to register compares.
-
- I did that, and it doesn't seem to make any difference. My timings are to
- the 1/10000 of a tick.
-
- >It should be quite hard to improve speed from the longer code. I bet it took
- >quite a few minutes to write it. :-)
-
- About 120, although I didn't do it all in one sitting so it's hard to say :^/
-
- >I do have an idea that you could try, if you still feel like the code should
- >be improved.
- >Instead of having a jump table, you could probably use a table of jumps. :-)
-
- It did make a very slight difference in the long version, but no difference
- in the short version. I think this is mostly because I still have to deal
- with the mask in the short version, so it really didn't change much.
-
- >Note that I removed the special checks for 0 and 255. I think they are
- >mostly wasted, but it's possible they speed things with masks with large
- >solid areas.
-
- Removing them makes the long version faster, and the short version slower.
-
- Here's the new long version, in its entirety. Enjoy.
-
- (This file must be converted with BinHex 4.0)
- :$%0[F(P0BA0V,Q0`G!"338083e"$9!!!!!!@#!!!!!$YpJ%"!2%!!"A1J$CQGfG
- BB!L!H!F'!)F!"JF!#!K`!!K3L!F!!!!!C`H!F)GQ"SGQG`L'!!F)#(!'D'CRD
- )#)KhChKi!(J!K`L)J!!!!!#!#!"h!!#!B!!!!!!!#!#!!!!!!)#!!(!!!!!!!)!
- !!(!!!!!)F)!!#!!!!!!!!)!)"`!!J!!!!!!(#!!"0$"8"38&!a)d3)LFJ69!U"0
- 5P'K&L,NhDL3#G2&%d@Z5Iq1cTGMj9*GERM*hIAjVhRRLZhhYLhhqlAeBq(aTE[p
- !f3k5TrAYql(%I[p%jaFm5h-kA8L6djVNaAr-e'ebGAPdBUeSc#BPZF&LYKa@12$
- k-9#D*TEA**rm8&aljDd6YIjl9Yr+50bNZe*9p[4d4QA0$4kf'Qf!qk`"4'GVadV
- 1jYhEfi$H[4M)M2LK1rNrK1hkNlQRk4XH*GLJ6QaXpGlfi[l2db[%#Xf1j%@GI`q
- !'8U(q*(+[q2bjHR[B,ZrJ[e2*l+QRCZqVI`Ehf!!!!"#!!"iH+B!"J!!!!!!!!!
- !!%!+!)ThLAPiBd9QC9D)#B#(H(LUUU#'UALALEQ3!!!!!!9fCAKPZ'GPB'9AKiQ
- C#3!)N!!J!!4&9@ChGN0&4BHSHAL)L(QQCjH(LCZAHDKSX*KiH)-r46099RChGRH
- *L)H'4@CQCfChLCQ)PjKiGkL*HBZVUCQUQCQULELCUkQ3!*!!QJLCTkLBQCQCHAU
- E#VS!!!#kc!RdHI(,cbq(KEUqGAYZhrYTV[qC%5)JH,il,%dCr9FM-Zk[[YfrY+[
- KC,[m0@`L)Ai@F*I5cKIEGZ[PGh5D@#iX0PH%YGflr"8'PUfflppZljFdYGZ'bcJ
- NKGQ'VRP)V+ll[$KUXPAZebefAi@lUm#-8+`IVYqZQ[IHNcHckll0@&QZ@&dYPId
- XN4K8VkqqbAED4NeQr$BSa$pC@lNNHh%4iV"q&QrKCICZ`[54GH`THR*4qN`pV"8
- P!2cY,AqmV9EEf!@l5XEVeB%E3[$Pp[lNBJQcMJ4+UphbfN'9K'KbeAElE0DM&[4
- Mar5kh@$dH6'Sd6rC!TYXaAm04%0jJ4EAIJSLf($&p[kiLKdU(A@lX-@)UDpH$5L
- "%[54#rCG`+L(MEV`f0+)E,,IPX`Pbiriaib0mq9U89HVCA`aHFM$28ULT')Y5+l
- EYhb+cAreRMYYh@+Sf4,fK'dQ"BXAIGVXSR,m*I$USRlIadpI4mIqG2[pa%KALBL
- 1mL3VPkrA+MPPpmSD8IXD%ImdXReb8R5(B4)9#dQM12&k24,XX`PiEjEbYE*F-,%
- 19pID3BlYaAbU@QZqc[Zi%LB*F0)pY*1bfTY9Hh9iEDm%lCfj-V5qPHh`XPhAF(G
- 'U8!%E)3+rkX`*%GdehfG'[A`j**a55jCIG,N+h$PPbFKAF%4APqrem,[(TrE#bp
- )c8GhGhFT,ib88A2p#*NQK@`TiA["+eTiX#qq56HNQUNHmX&8ii`#UD+YJ95MR5#
- UD&&Bqd4GTK9hX#AK90p2MEJ6UF%I2G)PUNYlZXJDX[[+qBYVmSpl2'[ITdrRT)K
- r*,c+4UmUk+dXLbqm9,ZiAGl8q&5,X(b9hpm[i+K1,6q2rhZjrdaE1DA9N!#b6HU
- @r``A4TlEVYj)Z1&YID5r!6C!kXLLCp(2lb[-jTG$5Qc5cIIEpHmMQP#D91qKU6X
- MBL*h"NlDYPeU6Q'K18YQ)LG6*FlD%j5fGfT-TN6,fqrmL-TkTNC*j286FYXV6Ym
- T*DZ[GN5M(LpNHNL5rf)NUja@kcki5)reSE"iNE!8%%5kE@PbEFHe++j6FT+`&F`
- ,TaFR42PqiM"NafC28JS92#*Irh6eiXA*l-#ZcjZ[fmh92RmI2Pj5*TrriGqpG%K
- A[hm,Yj(XN[5hq(E`Zm#0l+Rj"$B6Er)rRdT43I(Xq*!!IP4VTpraaB[CP8DKGZ)
- Q*,k&[G)L#hbfPEi5H(G)Mk(r$C4XpK@cTa,@lVF,DpT9U+Q58CHUl`,XSU*Km#2
- Jr[54-U'Y-Lh69kBN+p$@lK3!G*!!dBZUK)9*-0)mK1dkQNa''+2@+0LTMl[J4Te
- rcYhU-plTI[C`ZjPl@QTSQ3V416SMPqiV4V0YpM3"LA+V0T%Yl2ck8LM$38GGQVK
- ChNUL38-A9drj)JT%YpP#k,AfC5'BS5)rV&dGA8ZBHPGkDN3,E*G@ANC2P8C)TI)
- 8fIdVjj!!aGTFCUGmmNTUN!!GG(hIKA``A3#DEY5M$QME05[a-'(&h"R"KbmDC#$
- 416T4Xa,MkC`+fYF`FL+Th28V["5ldrPd%YSq`PedVIfK$LX-D%1,X90#(&H@d)F
- 4jl5KfCJKfGJKe,"$UQ#(D$-q!'Q"d'"m'#%'#-'#8'#F'#N'#X'#dc"DB0N-&TQ
- #dc"DCJY-`@QB,6-&TQ#e"JY3B,8!rX`@S-&U$"DJ`@S-&U$"DJ`@bQ#f8`@bQ#f
- 8)dM"E+B,C6"E+B,C6"E+B,CM"E-B,CM"E-B,CJZ8Q#fB`@c'#fB`@c'#fF`@cQ#
- fF`@cQ#fF`@cKH!-&XjJYR-&XjJY5B,8Q#e*JY5B,8Q#e*JY5'`ZB,8Q#e*JY8B,
- 9'#e4JY8B,9'#e4JY8B,9"Xh'#e4JYS-&Y"JYS-&Y"JYS-&Y"JYS-&Y"JYS$!+I!
- - -'!I)'!M)'!R)'!V)'![)'!c)'!h)'!l)'!r)!J9PR!3(PT2,8H@XmYKjE6bh(P[
- #A#!PaJ*FS#A1!PdJ*GB#AD!PhJ*H)#AQ!PkJ*Hi#Ab!PpJ*IS#Aq!8#!8'!8+!8
- 1!85!8@!8D!8H!8L!8Q!8U!8Z!8b!8f!8k!8q!9#!9'!9+!91!95!9@!9D!9H!9L
- !9Q!9U!9Z!9b!9f!9k!9q!@#!@'!@+!@1!@5!@@!@D!@H!@L!@Q!@U!@Z!@b!@f!
- @k!@q!A#!A'!A+!A1!A5!A@!AD!AH!AL!AQ!AU!AZ!Ab!Af!Ak!Aq!J"-)!6#!%`
- J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!62Q`2Qa9c
- C!)$jY$jY6jYMjYcjZ$jZ3J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"
- - -)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-X!#q!dU&jLpdj,a&qmhL,qh
- pFIU6mB[i5JRj0[m5GKhr5XhArH-AaTNmX+E+-KH9lbVf)C%f),[$Y8(L'+L"5Ql
- f0+-C5r4eJaRbP6TF`SR-!`Bd9%&hRa$PmIKQ#AQIB-`idYJjFb(1$-5SJ$-8fB0
- +56"0#SJi(p,bl!$fTS$`"rk29S'99Q2N6iEec)X`-b+L$Fd%Rk-X%d+L$Jd(-l!
- Je0#CK"XHLpl@fGB6-,@l`%bEk"EX`#&,X#$NhCTD%picUP34a%M1DQFj3"*83ER
- rYLD-3MSD`Le0DCK&XI8pid)4J3)+'%D6-8-ER!l2TL-MFp+4KGFm5-cYc3LAAL*
- HrFAA1fq!q,Fm)Pi+%R&[6Pe%R5h3KEb#Kk@k&Ah$E+IB8)J2a%akF6c&$rfkSZC
- SG95G5JV1(lHJUdCXh1Q3!%MUZ8*'f[QB5Bha(*[4eh5&$#0CQ+'DD1XHKi@*NBA
- A2&0'Hep-m,#LJfem,HR'&Miaj62#a-M#kiD-KhhPjF20!NA"ZS0p*![L-4McU*N
- BAa16VR5l-HHSYmXf3YMHD'8i`521SQ%R&b$PLfG,Ibe"!C!!aQ+"Y@1FTdeU2!b
- '#4KP1A#KDM`-KMpl``j)Rfe$`C)K&!j1(Mq+ke'K1ZfiZQ6hd"f$M"*DJ!RHf
- I6MfH6HK@DdZ6CU",1P4XjU'3!%9BF*C4fU1TQ%Rlm36)8+`L5bZR$$Q+"lN6qHI
- &@CC)M#kjbr`-b»@H-AA$`T2je9+&CPNL-,VKTF(*X[,LXj5A(q,iKUQImq*
- @HK)M#q*dSFF[BFV23Pb0Lm3GJ*r,,-UY$L4'&iJ[i(%8[,Le0iF$`P`@S$#Th(T
- E#`'9)3N*'GC1&Uc!i4K&$MhXR@6KDX`1'35-2IiGC1&V!!pI3PaM'8-M6l&aDaB
- 2fZ%Z)64RRVbSVL2TLCfV(aRXbS`h(da-`N1HaEkV)`X(da$X@$RXIJM)`X(da#4
- Kcf,K3LU3!2TL(D8(2Br"'46JI6%*'(2Bkh8%8i(da#A"cf2`4N8i(da#4Kcf2%I
- )T52TL'KiFDbi8)SH2Ti%8$LiH)q@SdIV0J1)393H4F@TqIV-N!#A"@E1LT3[1Q,
- fKi9RB+%8l&&!ZQ(UH24db+M#HTiA5MDjHMZd9"rEqZ0p$aR`FjcdAPi0fS%%6jJ
- ##SJ24'm!JHIP,TcDVD3K&J%A52`mm5IY*1'(-8-Fc`fa6%Bp!#C'&eca)cf`)(R
- U6q"iZZHh&)q1ZTR21SQ4KGF1LSH$CFA+6cj*kUaI%blB![L-4Md)*NBAa-Ra$`E
- *Mip##Id(&iMflDcia+8RSC6)`[%(Z4$LbAPa&0TqL)5i2Fi'&6Z25-+jqL)6d*k
- GC1%9Q(k)K,MMq'6V*`LX`r4%*'(`m1XR#eJ!G%3PaX,Jjb@4F@X@$SL%Z),M226
- 8LZ)r4%c&$2D)E1a!b)b+iMqaB5-2e'I&fDT&J"qNS5i2eF'3!%Gak4CSITq%p#H
- Ra-p8Lc3rYb%Z$B!2D`Gak4CSIh6#HK23XSIbd8M&3IhY#e!5i2EBGak4MF2m$K2
- 3RSH3!-@15-H4rL%*b(rr*q#-M'iIi`#4KrXTmAMY#fkJ2)B-M3dh1-5fKEUL$b'
- #HKTZDLiYe4"j$(cXE$b4',Lfl-2*%*F(NiIkFPYh`[dK%j2!j*pT`F!aL-YZN!"
- 2JS*&HVpfI*[S9aDTDNAQ2AEDUT!!63U)20abYKLI[e%*$VL@YB5I6H)S(A%rL
- f+eK*rZQ*'(A%J8%#a46aQ&,mbmZPQipQN!!MT91+NFAB$VN(M!D#kP8kFZMG6(A
- +cI)P83U4aI)1ZCqAM*8UQ(lpIBf138GdeS)k9+bT(&i`kk"am0"G5Tf1A4Ze$RT
- (Z5Z-6#@'"8pM%`*l(A8d&e+Y8#k-d`kkhY*iP@XU4aI)1Z`C2&jGKD`TqYGSrjV
- (ADh&eDb"qYGc-ZXmqm3LZakklh$lH@cKpEa5LXjkkdR!8XH(jcG3LXjkr28I"k0
- bjY,U+j(Vmf",VSYd)V'HZVXIap(LUE5kM$3pG6`PdFI,R%JM$mpG3)R-H0CY,U+
- e(VmT",SmGLT9QV@BAcI[5m%lI-Dq1Eq*Ui3NJQiZSGS6(b!1`8Ui3LLPkA,R,dN
- M!4Y)kZ'")MLl!I)%eLPA#86dd#YVRDZEa3V8U88$X81PUecG-V8S4$R+iS1e+hL
- G@PiRkM$Z&MdU03H#JN[#d821qq*8!b+L$Ldi*R#X`iflBeE0b9dYH8"!j,i&*ED
- ,["$(FD(Da!6"#&-60c-RJ*RYY)8"!"2j6TE5,[",0B(@5UJ&J8%lFc-Caal#5mH
- JZLBpIHfQaQp&dJ#F&c8%iZN(HQD3!2JULImd[[EUCrFFkrRY3#U+#Gr@CMqb6Qk
- 49Fd[S`,T"pH3!1$&F@eCRNLf,f"pD6cHaP9R85rP0m[B(eC0BR9R95%ljG)2fZ2
- bk4@JT,qX"G)22NiZk9SJ)28re*HipHfrVP9RX**-rd%XqAYXKbhLG@Ha+"Ji-Kr
- [*aG)LJTGcR#241)h##PY&(blR-%6MUVEakLMjGcP#24Sq(HF+@d8I,ZFJ5f*E(,
- Sqaac)TbAFi`R&RAH-4J&Y&8blR%%YLpKdNkc)Um5rl!1`'-0N!!R49-Pp+a%i`f
- HBp@`Y-FcJqq)Bkh!YV9R-FcJPXBE,akY@Fac12Ij1MQb!HVBF'1E)4k1FKj&0DX
- B[Z0G5QjDH@m6VB8LHF-AFMk[&iHcUdrTHL%BAMm*8#C@NT2bD#*amXcc(UY'LIK
- 6%HMiK4P38,DY0#IH0%YMh[hQ29DD%q-84k00!lr5KE9TS6hr",BPXFic`1+Y6LG
- MBJNF4Sa'+&Y@VT1V`5f-0Ae*dV9dRXFL8aD129S8,DZ'j0q!LF5f2@'GjaT@X"B
- rCF*a&XSd*1pTkf+TMa4#*a,EVI-T@a[-H0B4kirN!khc+9XEc(Md%YMrTYmhc&Y
- e*Mm@K1,'f50kjY1+fl3aq4`RD2E4h5M(Sbe"-ikhA4j5M'!be"-a11Maf$e'+
- TPU#(CU(4ii&Y'+TPU#%YMSmA$fT&Qc,8%1#FG(MJ@d@"-Y33PXG(MV**8L`*PU#
- %iMSmF#fL`*PU#%YMSmH464A)be"$K`(VT,akM$Jbe-#24kfMb+DeB`k!"k`!cF2
- ZF9V%"l83R%CcL&0&NaIF5-3Jr0TaHJL`JSp&k!r0Tp5+D%@2%pkJ[2Ep`)EZVD8
- N@+N3m5L1X4S$Tk`5m3km1Zf,DS!k9%(&TakPd[idEUBU3CN9%"MS25BRr#bq1Pb
- d($DpE41HMC-6[lbDZQh4S22BRr`PZEJaj&akq`#Fmd5BR(lIKbR1,T!!maLImT2
- PdJm$6@*c`f6%iZN(Xc,`r2)H!5Ip0YcKQIhL2LVbH3kVTLFHr@08cLk4#lX1RS4
- k154e`C6b)QKBG)`PXFKcF286V-2BNkFXSh2!%k*eQ(3m)R'jih$eDJTUrHSIc*(
- X'Y`p@SJDY1c-HXprQH4%pM$5%2NhC3jEa1LHaKT#%6ZA8#FR53LJTKT#%HLFAcF
- )K&(c$5%*EF[2bj[B44m`dK#24SqGGjbF)SqBD3K,BPXFZM@F88j-0)3R%I8fqSM
- %96-0)3PYdH`q*1N9H-0)3qh`BE0!R49-`rD`41-0QiHVB@Q1C`G8!aeZ"E@V1Bj
- R",B`fA$fDeCc(-ipriG(0N!p@`i-Ff3MdFj$b+DeBaIhqG35RP,(C-!6VB8LHR)
- @hMk`'J2+drTFk$fl6$Ck0@m%+d@*qQPl!Pqr*8hKp@IK2d-$riBI"@makV3FRjX
- 4(V,ZMYiI9R`6rf82rdSrC#makV1NRcAL24U2I0bbK@HK2a[#8aUlHBp9RS6mE`M
- ebRqFhC+cP*q'BHCjkGYGeXDS9RJ6r!X4k2F[I9d*DK9'8)5Q,4a@)$LY6mM+`*a
- (ZYZF3Le@5-Y`P-FYcD29UXNCEK(SlrlQp"E$-MSf%TL`TcZl,BI%H``35kF,H8q
- %J['kdKpfm9P3!NU)20U@8p2a@CY+Q1eQG)-c+L$kMZbRT55",4(Fl35fF33aK0S
- R24iQ*fjZ$`%lfflXTjmLJPTflR,lQ"i,Qd6RQS6%lqphmCaR0dKjNNKkqpY4i$d
- H!*V%jiH*LFA5$fE'N!$ji%%lpc6fDeGZSFQlD#$bqT!!)EGc)FJ6NfE-LAcb+lA
- ic0$YUYSm4!8JmIrFd1q`l!mL$A'h4rM[40B(3[A'h4r$el(jE)L"hPEfaM%Xm!i
- 81N3hdP%hmP3AdHLACifiDYNZf[9mjBA5`f@5hm,2TEGiAbVekq&PppZlj5llYGK
- 8bl2'[ITdrRT)Kr*,c&4DLFZAeBmImJ!!!!)G1+d!!3!&FA9TBfX"!!!!#&4&@&4
- ,38K-Tc&rQUFaIj)"!,pi'U3!"J!!!C)!!0M9!!!"GJ!!&&!R5J!!:
-
-
- - --
- _____________________________________________________________________________
- Michael A. Kelly Senior Partner
- mkelly@cs.uoregon.edu High Risk Ventures
- _____________________________________________________________________________
-
- +++++++++++++++++++++++++++
-
- From: Steve Christensen <stevec@apple.com>
- Date: Fri, 20 Nov 1992 04:06:08 GMT
- Organization: Apple Computer, Inc.
-
- I took a pass at the code and here's what I came up with (changed lines
- will have a * in the comment field).
-
- MOVE.W h, D0 ; put height loop variable in D0
- MOVEA.L src, A0 ; put the source pixmap address in A0
- MOVEA.L dst, A1 ; put the destination address in A1
- MOVEA.L mask, A2 ; put the mask address in A2
- CLR.L D2 ; clear the mask register
- @1: ; copy the next row
- MOVE.W w, D1
- @2: ; copy the next eight bytes in the row
- MOVE.B (A2)+, D2 ; copy the next mask byte
- BEQ.S @nocopy ;*if zero, don't copy anything
-
- CMPI.B #0xFF, D2
- BNE.S @hardway ;*don't copy everything
-
- MOVE.L (A0)+, (A1)+ ; copy all bytes
- MOVE.L (A0)+, (A1)+
- DBF D1, @2
- BRA.S @endloop ;*
-
- @nocopy: ; copy no bytes
- ADDQ.L #8, A0
- ADDQ.L #8, A1
- DBF D1, @2
- BRA.S @endloop ;*
-
- @hardway:
- MOVEQ #0xF, D3 ;*mask off the lower nibble for later
- AND.B D2, D3 ;*
- LSR.W #4, D2 ; shift bits 4-7 into bits 0-3
- ADD.W D2, D2 ; double the index
- MOVE.W @table(D2.W), D2 ;*calculate the address
- LEA @rts1, A3 ; save the return address
- JMP @table(D2.W) ; plot four pixels
- @rts1:
-
- ;*******MOVE.B -1(A2), D2 ; copy the next mask byte
- ;*******ANDI.B #0xF, D2 ; mask off the high four bits
- ADD.W D3, D3 ;*double the index
- MOVE.W @table(D3.W), D2 ;*calculate the address
- LEA @rts2, A3 ; save the return address
- JMP @table(D2.W) ; plot four pixels
- @rts2:
- DBF D1, @2
-
- @endloop:
-
- MOVE.W e, D1 ;*
- BLT.S @4 ;*continue if e is less than 0
-
- MOVE.B (A2)+, D2 ; copy the next mask byte
- ;*******MOVE.W e, D1 ; initialize the loop counter
- MOVEQ.L #7, D3 ; initialize the bit counter
-
- @3: ; copy the next byte
- BTST D3, D2 ; test the next bit in the mask
- BEQ.S @skip ;*if zero, continue
- MOVE.B (A0)+, (A1)+ ; else copy the pixel
- SUBQ.L #1, D3 ; decrement the bit counter
- DBF D1, @3
- BRA.S @4 ;*
- @skip:
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- SUBQ.L #1, D3 ; decrement the bit counter
- DBF D1, @3
-
- @4:
- ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
- ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
-
- DBF D0, @1
-
- JMP @end ; skip to the end
-
- @table:
- DC.W @sub0-@table ;*
- DC.W @sub1-@table ;*
- DC.W @sub2-@table ;*
- DC.W @sub3-@table ;*
- DC.W @sub4-@table ;*
- DC.W @sub5-@table ;*
- DC.W @sub6-@table ;*
- DC.W @sub7-@table ;*
- DC.W @sub8-@table ;*
- DC.W @sub9-@table ;*
- DC.W @sub10-@table ;*
- DC.W @sub11-@table ;*
- DC.W @sub12-@table ;*
- DC.W @sub13-@table ;*
- DC.W @sub14-@table ;*
- DC.W @sub15-@table ;*
-
- @sub0: ; mask = 0000, draw nothing
- ADDQ.L #4, A0
- ADDQ.L #4, A1
- JMP (A3) ; RTS
-
- @sub1: ; mask = 0001
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.B (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub2: ; mask = 0010
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- JMP (A3) ; RTS
-
- @sub3: ; mask = 0011
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.W (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub4: ; mask = 0100
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- JMP (A3) ; RTS
-
- @sub5: ; mask = 0101
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.B (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub6: ; mask = 0110
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.W (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- JMP (A3) ; RTS
-
- @sub7: ; mask = 0111
- ADDQ.L #1, A0
- ADDQ.L #1, A1
- MOVE.B (A0)+, (A1)+
- MOVE.W (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub8: ; mask = 1000
- MOVE.B (A0), (A1)
- ADDQ.L #4, A0
- ADDQ.L #4, A1
- JMP (A3) ; RTS
-
- @sub9: ; mask = 1001
- MOVE.B (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.B (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub10: ; mask = 1010
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- JMP (A3) ; RTS
-
- @sub11: ; mask = 1011
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- MOVE.W (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub12: ; mask = 1100
- MOVE.W (A0), (A1)
- ADDQ.L #4, A0
- ADDQ.L #4, A1
- JMP (A3) ; RTS
-
- @sub13: ; mask = 1101
- MOVE.W (A0), (A1)
- ADDQ.L #3, A0
- ADDQ.L #3, A1
- MOVE.B (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @sub14: ; mask = 1110
- MOVE.W (A0)+, (A1)+
- MOVE.B (A0), (A1)
- ADDQ.L #2, A0
- ADDQ.L #2, A1
- JMP (A3) ; RTS
-
- @sub15: ; mask = 1111
- MOVE.L (A0)+, (A1)+
- JMP (A3) ; RTS
-
- @end:
-
- ---------------------------
-
- End of C.S.M.P. Digest
- **********************
-